Purpose

Look at the sampling effort for the various WQ parameters to be included in the long-term WQ publication. Here are some notes and issues to consider:

Notable differences between raw_chla_1975_2021 (from DroughtData) and the data used by the Primary Producers group:

  1. raw_chla_1975_2021 includes the four EZ stations from EMP
  2. raw_chla_1975_2021 has data for November 2021
  3. raw_chla_1975_2021 has additional Chlorophyll_Sign column that indicates whether the chlorophyll-a value is above or below the RL. For the 3 values that are below the RL, raw_chla_1975_2021 reports them as equal to the RL, while the data from the Primary Producers group reports them as half the RL.

Things to consider with the nutrient, chlorophyll-a, and WQ data sets:

  1. The chlorophyll-a data set includes the Grant Line Canal and Old River subregion. Should this region be included in the nutrient and WQ data sets?
  2. The chlorophyll-a data set is not filtered to remove records where Long_term == FALSE, while those records are removed from the nutrient and WQ data sets. We may want to make these all consistent.

So far, these are the surveys included in this summary:

  • Water Temperature, Salinity, Secchi depth: EMP, STN, FMWT
  • Nutrients: EMP, USGS_SFBS, USGS_CAWSC (just 11447650 - Sacramento River at Freeport)
  • Chlorophyll-a: EMP, USGS_SFBS

Global code and functions

# Load packages
library(tidyverse)
library(lubridate)
library(hms)
library(scales)
library(DroughtData)
library(discretewq)
library(deltamapr)
library(sf)
library(dtplyr)
# Run session info to display package versions
devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.2.1 (2022-06-23 ucrt)
##  os       Windows 10 x64 (build 19044)
##  system   x86_64, mingw32
##  ui       RTerm
##  language (EN)
##  collate  English_United States.utf8
##  ctype    English_United States.utf8
##  tz       America/Los_Angeles
##  date     2022-08-10
##  pandoc   2.18 @ C:/Program Files/RStudio/bin/quarto/bin/tools/ (via rmarkdown)
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package       * version    date (UTC) lib source
##  assertthat      0.2.1      2019-03-21 [1] CRAN (R 4.2.1)
##  backports       1.4.1      2021-12-13 [1] CRAN (R 4.2.0)
##  broom           1.0.0      2022-07-01 [1] CRAN (R 4.2.1)
##  bslib           0.4.0      2022-07-16 [1] CRAN (R 4.2.1)
##  cachem          1.0.6      2021-08-19 [1] CRAN (R 4.2.1)
##  callr           3.7.1      2022-07-13 [1] CRAN (R 4.2.1)
##  cellranger      1.1.0      2016-07-27 [1] CRAN (R 4.2.1)
##  class           7.3-20     2022-01-16 [2] CRAN (R 4.2.1)
##  classInt        0.4-7      2022-06-10 [1] CRAN (R 4.2.1)
##  cli             3.3.0      2022-04-25 [1] CRAN (R 4.2.1)
##  colorspace      2.0-3      2022-02-21 [1] CRAN (R 4.2.1)
##  crayon          1.5.1      2022-03-26 [1] CRAN (R 4.2.1)
##  data.table      1.14.2     2021-09-27 [1] CRAN (R 4.2.1)
##  DBI             1.1.3      2022-06-18 [1] CRAN (R 4.2.1)
##  dbplyr          2.2.1      2022-06-27 [1] CRAN (R 4.2.1)
##  deltamapr     * 1.0.0      2021-06-18 [1] Github (InteragencyEcologicalProgram/deltamapr@d0a6f9c)
##  devtools        2.4.4      2022-07-20 [1] CRAN (R 4.2.1)
##  digest          0.6.29     2021-12-01 [1] CRAN (R 4.2.1)
##  discretewq    * 2.3.2      2022-08-03 [1] Github (sbashevkin/discretewq@99b3ce8)
##  dplyr         * 1.0.9      2022-04-28 [1] CRAN (R 4.2.1)
##  DroughtData   * 1.1.0.9000 2022-06-21 [1] local
##  dtplyr        * 1.2.1      2022-01-19 [1] CRAN (R 4.2.1)
##  e1071           1.7-11     2022-06-07 [1] CRAN (R 4.2.1)
##  ellipsis        0.3.2      2021-04-29 [1] CRAN (R 4.2.1)
##  evaluate        0.15       2022-02-18 [1] CRAN (R 4.2.1)
##  fansi           1.0.3      2022-03-24 [1] CRAN (R 4.2.1)
##  fastmap         1.1.0      2021-01-25 [1] CRAN (R 4.2.1)
##  forcats       * 0.5.1      2021-01-27 [1] CRAN (R 4.2.1)
##  fs              1.5.2      2021-12-08 [1] CRAN (R 4.2.1)
##  gargle          1.2.0      2021-07-02 [1] CRAN (R 4.2.1)
##  generics        0.1.3      2022-07-05 [1] CRAN (R 4.2.1)
##  ggplot2       * 3.3.6      2022-05-03 [1] CRAN (R 4.2.1)
##  glue            1.6.2      2022-02-24 [1] CRAN (R 4.2.1)
##  googledrive     2.0.0      2021-07-08 [1] CRAN (R 4.2.1)
##  googlesheets4   1.0.0      2021-07-21 [1] CRAN (R 4.2.1)
##  gtable          0.3.0      2019-03-25 [1] CRAN (R 4.2.1)
##  haven           2.5.0      2022-04-15 [1] CRAN (R 4.2.1)
##  hms           * 1.1.1      2021-09-26 [1] CRAN (R 4.2.1)
##  htmltools       0.5.3      2022-07-18 [1] CRAN (R 4.2.1)
##  htmlwidgets     1.5.4      2021-09-08 [1] CRAN (R 4.2.1)
##  httpuv          1.6.5      2022-01-05 [1] CRAN (R 4.2.1)
##  httr            1.4.3      2022-05-04 [1] CRAN (R 4.2.1)
##  jquerylib       0.1.4      2021-04-26 [1] CRAN (R 4.2.1)
##  jsonlite        1.8.0      2022-02-22 [1] CRAN (R 4.2.1)
##  KernSmooth      2.23-20    2021-05-03 [2] CRAN (R 4.2.1)
##  knitr           1.39       2022-04-26 [1] CRAN (R 4.2.1)
##  later           1.3.0      2021-08-18 [1] CRAN (R 4.2.1)
##  lifecycle       1.0.1      2021-09-24 [1] CRAN (R 4.2.1)
##  lubridate     * 1.8.0      2021-10-07 [1] CRAN (R 4.2.1)
##  magrittr        2.0.3      2022-03-30 [1] CRAN (R 4.2.1)
##  memoise         2.0.1      2021-11-26 [1] CRAN (R 4.2.1)
##  mime            0.12       2021-09-28 [1] CRAN (R 4.2.0)
##  miniUI          0.1.1.1    2018-05-18 [1] CRAN (R 4.2.1)
##  modelr          0.1.8      2020-05-19 [1] CRAN (R 4.2.1)
##  munsell         0.5.0      2018-06-12 [1] CRAN (R 4.2.1)
##  pillar          1.8.0      2022-07-18 [1] CRAN (R 4.2.1)
##  pkgbuild        1.3.1      2021-12-20 [1] CRAN (R 4.2.1)
##  pkgconfig       2.0.3      2019-09-22 [1] CRAN (R 4.2.1)
##  pkgload         1.3.0      2022-06-27 [1] CRAN (R 4.2.1)
##  prettyunits     1.1.1      2020-01-24 [1] CRAN (R 4.2.1)
##  processx        3.7.0      2022-07-07 [1] CRAN (R 4.2.1)
##  profvis         0.3.7      2020-11-02 [1] CRAN (R 4.2.1)
##  promises        1.2.0.1    2021-02-11 [1] CRAN (R 4.2.1)
##  proxy           0.4-27     2022-06-09 [1] CRAN (R 4.2.1)
##  ps              1.7.1      2022-06-18 [1] CRAN (R 4.2.1)
##  purrr         * 0.3.4      2020-04-17 [1] CRAN (R 4.2.1)
##  R6              2.5.1      2021-08-19 [1] CRAN (R 4.2.1)
##  Rcpp            1.0.9      2022-07-08 [1] CRAN (R 4.2.1)
##  readr         * 2.1.2      2022-01-30 [1] CRAN (R 4.2.1)
##  readxl          1.4.0      2022-03-28 [1] CRAN (R 4.2.1)
##  remotes         2.4.2      2021-11-30 [1] CRAN (R 4.2.1)
##  reprex          2.0.1      2021-08-05 [1] CRAN (R 4.2.1)
##  rlang           1.0.4      2022-07-12 [1] CRAN (R 4.2.1)
##  rmarkdown       2.14       2022-04-25 [1] CRAN (R 4.2.1)
##  rstudioapi      0.13       2020-11-12 [1] CRAN (R 4.2.1)
##  rvest           1.0.2      2021-10-16 [1] CRAN (R 4.2.1)
##  sass            0.4.2      2022-07-16 [1] CRAN (R 4.2.1)
##  scales        * 1.2.0      2022-04-13 [1] CRAN (R 4.2.1)
##  sessioninfo     1.2.2      2021-12-06 [1] CRAN (R 4.2.1)
##  sf            * 1.0-8      2022-07-14 [1] CRAN (R 4.2.1)
##  shiny           1.7.2      2022-07-19 [1] CRAN (R 4.2.1)
##  stringi         1.7.8      2022-07-11 [1] CRAN (R 4.2.1)
##  stringr       * 1.4.0      2019-02-10 [1] CRAN (R 4.2.1)
##  tibble        * 3.1.8      2022-07-22 [1] CRAN (R 4.2.1)
##  tidyr         * 1.2.0      2022-02-01 [1] CRAN (R 4.2.1)
##  tidyselect      1.1.2      2022-02-21 [1] CRAN (R 4.2.1)
##  tidyverse     * 1.3.2      2022-07-18 [1] CRAN (R 4.2.1)
##  tzdb            0.3.0      2022-03-28 [1] CRAN (R 4.2.1)
##  units           0.8-0      2022-02-05 [1] CRAN (R 4.2.1)
##  urlchecker      1.0.1      2021-11-30 [1] CRAN (R 4.2.1)
##  usethis         2.1.6      2022-05-25 [1] CRAN (R 4.2.1)
##  utf8            1.2.2      2021-07-24 [1] CRAN (R 4.2.1)
##  vctrs           0.4.1      2022-04-13 [1] CRAN (R 4.2.1)
##  withr           2.5.0      2022-03-03 [1] CRAN (R 4.2.1)
##  xfun            0.31       2022-05-10 [1] CRAN (R 4.2.1)
##  xml2            1.3.3      2021-11-30 [1] CRAN (R 4.2.1)
##  xtable          1.8-4      2019-04-21 [1] CRAN (R 4.2.1)
##  yaml            2.3.5      2022-02-21 [1] CRAN (R 4.2.1)
## 
##  [1] C:/R/win-library/4.2
##  [2] C:/Program Files/R/R-4.2.1/library
## 
## ──────────────────────────────────────────────────────────────────────────────

Import and Prepare Data

# Define factor level for Region
region_lev <- c(
  "North",
  "SouthCentral",
  "Confluence",
  "Suisun Bay",
  "Suisun Marsh"
)

# Create a nested data frame and prepare to run the summary plot function on - data in DroughtData
ndf_lt_wq <- 
  tibble(
    Parameter = c(
      "Temperature",
      "Salinity",
      "Secchi",
      "DissAmmonia",
      "DissNitrateNitrite",
      "DissOrthophos",
      "Chlorophyll"
    ),
    df_data = c(
      rep(list(raw_wq_1975_2021), 3),
      rep(list(raw_nutr_1975_2021), 3),
      list(raw_chla_1975_2021)
    )
  ) %>% 
  mutate(
    df_data_c = map2(
      df_data, 
      Parameter,
      ~ drop_na(.x, all_of(.y)) %>% 
        mutate(
          Month = fct_rev(month(Date, label = TRUE)),
          Region = factor(Region, levels = region_lev)
        )
    )
  )

# Pull in and prepare the WQ data from discretewq
df_dwq <-
  wq(
    Sources = c(
      "EMP",
      "STN",
      "FMWT",
      "EDSM",
      "DJFMP",
      "SDO",
      "SKT",
      "SLS",
      "20mm",
      "Suisun",
      "Baystudy",
      "USBR",
      "USGS_SFBS",
      "YBFMP",
      "USGS_CAWSC"
    )
  ) %>% 
  transmute(
    Source,
    Station,
    Latitude,
    Longitude,
    Date = date(Date),
    # Convert Datetime to PST
    Datetime = with_tz(Datetime, tzone = "Etc/GMT+8"),
    Temperature,
    Salinity,
    Secchi,
    Chlorophyll,
    DissAmmonia,
    DissNitrateNitrite,
    DissOrthophos
  ) %>% 
  filter(
    !if_all(
      c(Temperature, Salinity, Secchi, Chlorophyll, DissAmmonia, DissNitrateNitrite, DissOrthophos),
      is.na
    )
  )

Data in DroughtData

Create Plots

# Create function for sampling effort plots
plot_samp_effort <- function(df, grp_var, yr_var) {
  df %>% 
    count(.data[[yr_var]], Month, .data[[grp_var]], name = "num_samples") %>% 
    ggplot(aes(x = .data[[yr_var]], y = Month, fill = num_samples)) +
    geom_tile() +
    facet_grid(rows = vars(.data[[grp_var]]), drop = FALSE) +
    scale_x_continuous(breaks = breaks_pretty(20), expand = expansion()) +
    scale_y_discrete(drop = FALSE) +
    scale_fill_viridis_c(name = "Number of Samples") +
    theme_bw() +
    theme(
      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
      legend.position = "top"
    )
}
# Create plots for each Parameter
ndf_lt_wq_plt <- ndf_lt_wq %>% 
  mutate(plt = map(df_data_c, .f = plot_samp_effort, grp_var = "Region", yr_var = "YearAdj"))

Sampling Effort

Temperature

Salinity

Secchi

DissAmmonia

DissNitrateNitrite

DissOrthophos

Chlorophyll

Secchi Sampling Effort for North region

The secchi depth data has a large hole from Jan-Aug in 2017-2021 in the North region, which seems suspicious. Let’s look at the sampling effort for secchi for each station in the North region.

ndf_lt_wq$df_data_c[[3]] %>% 
  filter(Region == "North") %>%
  plot_samp_effort(grp_var = "Station", yr_var = "YearAdj")

Hmmm, the problem seems to be coming from the one current EMP station (C3A) in the North region. Let’s look at the data directly from the discretewq package (version 2.3.2) to see if other current EMP stations have WQ data during the same time period.

WQ Sampling Effort for current EMP stations

# Import EMP station metadata from most recent EDI publication to use for status designation
df_emp_meta <- read_csv("https://portal.edirepository.org/nis/dataviewer?packageid=edi.458.6&entityid=ecf241d54a8335a49f8dfc8813d75609")

# Create vector of stations with active status
emp_sta_active <- df_emp_meta %>% 
  filter(Status == "Active") %>% 
  pull(Station)

# Pull out water quality data from EMP dataset within discretewq
emp_wq <- discretewq::EMP %>% 
  transmute(
    Station,
    Date = date(Date),
    Year = year(Date),
    Month = fct_rev(month(Date, label = TRUE)),
    Conductivity,
    Temperature,
    Secchi
  ) %>% 
  # Only include active stations
  filter(Station %in% emp_sta_active)

# Create a nested data frame and run the summary plot function on each WQ parameter
ndf_emp_wq <- 
  tibble(
    Parameter = c(
      "Conductivity",
      "Temperature",
      "Secchi"
    ),
    df_data = rep(list(emp_wq), 3)
  ) %>% 
  mutate(
    df_data_c = map2(df_data, Parameter, ~ drop_na(.x, all_of(.y))),
    plt = map(df_data_c, .f = plot_samp_effort, grp_var = "Station", yr_var = "Year")
  )

It looks like only the shore-based stations (C3A, C10A, C9) have the issue of missing secchi depth data in recent years. Sarah Perry confirmed that EMP doesn’t collect secchi depth data at their shore-based stations anymore, so that explains what’s going on here.

Conductivity

Temperature

Secchi

Data in discretewq

Now that we’ve been making updates to the WQ data in the discretewq package, let’s take a look at which surveys we can use for the long-term WQ publication. First, we’ll look at the temporal scale of all of the surveys available.

df_dwq %>% 
  group_by(Source) %>% 
  summarize(min_date = min(Date), max_date = max(Date)) %>% 
  arrange(min_date)
## # A tibble: 15 × 3
##    Source     min_date   max_date  
##    <chr>      <date>     <date>    
##  1 FMWT       1967-09-12 2021-12-16
##  2 USGS_SFBS  1969-04-10 2022-05-09
##  3 STN        1969-07-05 2021-08-19
##  4 USGS_CAWSC 1971-03-02 2021-03-10
##  5 EMP        1975-01-07 2021-12-16
##  6 DJFMP      1976-05-13 2021-12-29
##  7 Suisun     1979-05-16 2021-08-25
##  8 Baystudy   1980-01-23 2020-12-01
##  9 20mm       1995-04-24 2021-07-16
## 10 SDO        1997-08-04 2018-11-08
## 11 YBFMP      1998-01-19 2021-12-30
## 12 SKT        2002-01-07 2021-04-29
## 13 SLS        2009-01-05 2021-03-17
## 14 USBR       2012-05-08 2019-10-22
## 15 EDSM       2016-12-15 2021-11-26

Overall, for all parameters, it looks like FMWT, USGS_SFBS, STN, USGS_CAWSC, EMP, DJFMP, Suisun, and Baystudy have adequate temporal coverage for the long-term analysis. Now let’s take a look at the spatial coverage.

# Only include surveys with adequate temporal coverage
df_dwq_lt <- df_dwq %>% 
  filter(Source %in% c("FMWT", "USGS_SFBS", "STN", "USGS_CAWSC", "EMP", "DJFMP", "Suisun", "Baystudy"))

# Pull out station coordinates and convert to sf object
sf_dwq_lt <- df_dwq_lt %>% 
  distinct(Source, Station, Latitude, Longitude) %>% 
  drop_na(Latitude, Longitude) %>% 
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326) %>% 
  st_transform(st_crs(R_EDSM_Subregions_Mahardja_FLOAT))

# Load Delta Shapefile from Brian and only keep SubRegions east of Carquinez Straight
sf_delta <- R_EDSM_Subregions_Mahardja_FLOAT %>% 
  filter(
    !SubRegion %in% c(
      "Carquinez Strait", 
      "Lower Napa River", 
      "San Francisco Bay",
      "San Pablo Bay",
      "South Bay",
      "Upper Napa River" 
    )
  ) %>% 
  select(SubRegion)

# Plot all stations over the SubRegions
ggplot() +
  geom_sf(data = sf_delta, fill = "green", alpha = 0.5) +
  geom_sf(data = sf_dwq_lt)

# Assign SubRegions to the stations
sf_dwq_lt_reg <- sf_dwq_lt %>% 
  st_join(sf_delta, join = st_intersects) %>%
  filter(!is.na(SubRegion))

# Remove SubRegions without stations from sf_delta
sf_delta_c <- sf_delta %>% filter(SubRegion %in% unique(sf_dwq_lt_reg$SubRegion))

# Plot all stations over the SubRegions again - with limited ranges
ggplot() +
  geom_sf(data = sf_delta_c, fill = "blue", alpha = 0.5) +
  geom_sf(data = sf_dwq_lt_reg)

All but one of the SubRegions east of Carquinez Straight has at least one station from a long term survey within it. Now let’s take a closer look at the temporal data coverage for each Station and parameter.

# Prepare long-term data from discretewq
df_dwq_lt_c <- df_dwq_lt %>% 
  # Remove records without latitude-longitude coordinates
  drop_na(Latitude, Longitude) %>%
  # Convert to sf object
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326, remove = FALSE) %>%
  # Change to crs of sf_delta_c
  st_transform(crs = st_crs(sf_delta_c)) %>%
  # Add subregions
  st_join(sf_delta_c, join = st_intersects) %>%
  # Remove any data outside our subregions of interest
  filter(!is.na(SubRegion)) %>%
  # Drop sf geometry column since it's no longer needed
  st_drop_geometry() %>% 
  # Add variables for adjusted calendar year, month, and season
    # Adjusted calendar year: December-November, with December of the previous calendar year
    # included with the following year
  mutate(
    Month = month(Date),
    YearAdj = if_else(Month == 12, year(Date) + 1, year(Date)),
    Season = case_when(
      Month %in% 3:5 ~ "Spring",
      Month %in% 6:8 ~ "Summer",
      Month %in% 9:11 ~ "Fall",
      Month %in% c(12, 1, 2) ~ "Winter"
    )
  ) %>% 
  # Restrict data to 1975-2021
  filter(YearAdj %in% 1975:2021)

Sampling Effort by Station

plot_samp_effort_sta <- function(df, param) {
  df_param <- df %>% drop_na(.data[[param]])
  
  # Look for any instances when more than 1 data point was collected at a station-day
  df_dups <- df_param %>%
    count(Station, Date) %>% 
    filter(n > 1) %>% 
    select(-n)
  
  # Fix duplicates if there are any
  if (length(df_dups > 0)) {
    df_dups_fixed <- df_param %>%
      inner_join(df_dups, by = c("Station", "Date")) %>%
      drop_na(Datetime) %>%
      mutate(
        # Create variable for time
        Time = as_hms(Datetime),
        # Calculate difference from noon for each data point for later filtering
        Noon_diff = abs(hms(hours = 12) - Time)
      ) %>%
      # Use dtplyr to speed up operations
      lazy_dt() %>%
      group_by(Station, Date) %>%
      # Select only 1 data point per station and date, choose data closest to noon
      filter(Noon_diff == min(Noon_diff)) %>%
      # When points are equidistant from noon, select earlier point
      filter(Time == min(Time)) %>%
      ungroup() %>%
      # End dtplyr operation
      as_tibble() %>%
      select(-c(Time, Noon_diff))
    
    df_param <- df_param %>% 
      anti_join(df_dups, by = c("Station", "Date")) %>%
      bind_rows(df_dups_fixed)
  }
  
  # Create summary heat map of sampling effort for each station
  df_param %>%
    count(Station, YearAdj, name = "num_samples") %>%
    ggplot(aes(x = YearAdj, y = Station, fill = num_samples)) +
    geom_tile() +
    scale_x_continuous(breaks = breaks_pretty(20), expand = expansion()) +
    scale_fill_viridis_c(name = "Number of Samples") +
    theme_bw() +
    theme(
      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
      legend.position = "top"
    )
}
# Nest long-term data from discretewq by Source
ndf_dwq_lt_c <- df_dwq_lt_c %>% nest(df_data = -Source)

# Create a nested data frame to run the summary plot function on
ndf_dwq_lt_c <- 
  tibble(
    Parameter = c(
      "Temperature",
      "Salinity",
      "Secchi",
      "DissAmmonia",
      "DissNitrateNitrite",
      "DissOrthophos",
      "Chlorophyll"
    ),
    df_data = rep(list(ndf_dwq_lt_c), 7)
  ) %>% 
  unnest(df_data)

# Create plots for each Parameter and Source
ndf_dwq_lt_plt <- ndf_dwq_lt_c %>% 
  mutate(plt = map2(df_data, Parameter, .f = plot_samp_effort_sta)) %>% 
  select(-df_data) %>% 
  nest(plts = -Parameter)

Temperature

FMWT

Baystudy

STN

Suisun

DJFMP

EMP

USGS_SFBS

USGS_CAWSC

Salinity

FMWT

Baystudy

STN

Suisun

DJFMP

EMP

USGS_SFBS

USGS_CAWSC

Secchi

FMWT

Baystudy

STN

Suisun

DJFMP

EMP

USGS_SFBS

USGS_CAWSC

DissAmmonia

FMWT

Baystudy

STN

Suisun

DJFMP

EMP

USGS_SFBS

USGS_CAWSC

DissNitrateNitrite

FMWT

Baystudy

STN

Suisun

DJFMP

EMP

USGS_SFBS

USGS_CAWSC

DissOrthophos

FMWT

Baystudy

STN

Suisun

DJFMP

EMP

USGS_SFBS

USGS_CAWSC

Chlorophyll

FMWT

Baystudy

STN

Suisun

DJFMP

EMP

USGS_SFBS

USGS_CAWSC